library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.5.9000
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
It is current practice to measure many variables on the same patients, we may have all the biometrical characteristics, height, weight, BMI, age as well as clinical variables such as blood pressure, blood sugar, heart rate for 100 patients, these variables will not be independent.
To start off, a useful toy example we’ll use is from the sports world; performances of decathlon athletes.
These are measurements of athletes’ performances in the decathlon: the variables m100, m400, m1500 are performance times in seconds for the 100 metres, 400 metres and 1500 meters respectively, ‘m110‘ is the time taken to finish the 110 meters hurdles whereas pole is the pole-jump height, weight is the length in metres the athletes threw the weight.
| m100 | long | weight | highj | m400 | m110 | disc | pole | jave | m1500 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 11.25 | 7.43 | 15.48 | 2.27 | 48.90 | 15.13 | 49.28 | 4.70 | 61.32 | 268.95 |
| 2 | 10.87 | 7.45 | 14.97 | 1.97 | 47.71 | 14.46 | 44.36 | 5.10 | 61.76 | 273.02 |
| 3 | 11.18 | 7.44 | 14.20 | 1.97 | 48.29 | 14.81 | 43.66 | 5.20 | 64.16 | 263.20 |
| 4 | 10.62 | 7.38 | 15.02 | 2.03 | 49.06 | 14.72 | 44.80 | 4.90 | 64.04 | 285.11 |
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
diabetes=read.table(url("http://bios221.stanford.edu/data/diabetes.txt"),
header=TRUE,row.names=1)
diabetes[1:4,]
## relwt glufast glutest steady insulin Group
## 1 0.81 80 356 124 55 3
## 3 0.94 105 319 143 105 3
## 5 1.00 90 323 240 143 3
## 7 0.91 100 350 221 119 3
data("GlobalPatterns", package = "phyloseq")
GPOTUs = as.matrix(t(phyloseq::otu_table(GlobalPatterns)))
GPOTUs[1:4, 6:13]
## OTU Table: [4 taxa and 8 samples]
## taxa are rows
## 246140 143239 244960 255340 144887 141782 215972 31759
## CL3 0 7 0 153 3 9 0 0
## CC1 0 1 0 194 5 35 3 1
## SV1 0 0 0 0 0 0 0 0
## M31Fcsw 0 0 0 0 0 0 0 0
Here are some RNA-seq transcriptomic data showing numbers of mRNA reads present for different patient samples, the rows are patients and the columns are the genes.
FBgn0000017 FBgn0000018 FBgn0000022 FBgn0000024 FBgn0000028 FBgn0000032
untreated1 4664 583 0 10 0 1446
untreated2 8714 761 1 11 1 1713
untreated4 3150 310 0 3 0 672
treated1 6205 722 0 10 0 1698
treated3 3334 308 0 5 1 757Mass spectroscopy data where we have samples containing informative labels (knockout versus wildtype mice) and protein \(\times\) features designated by their m/z number.
mz 129.9816 72.08144 151.6255 142.0349 169.0413 186.0355
KOGCHUM1 60515 181495 0 196526 25500 51504.40
WTGCHUM1 252579 54697 412 487800 48775 130491.15
WTGCHUM2 187859 56318 46425 454226 45626 100845.01#######Melanoma/Tcell Data: Peter Lee, Susan Holmes, PNAS.
load(url("http://bios221.stanford.edu/data/Msig3transp.RData"))
round(Msig3transp,2)[1:5,1:6]
## X3968 X14831 X13492 X5108 X16348 X585
## HEA26_EFFE_1 -2.61 -1.19 -0.06 -0.15 0.52 -0.02
## HEA26_MEM_1 -2.26 -0.47 0.28 0.54 -0.37 0.11
## HEA26_NAI_1 -0.27 0.82 0.81 0.72 -0.90 0.75
## MEL36_EFFE_1 -2.24 -1.08 -0.24 -0.18 0.64 0.01
## MEL36_MEM_1 -2.68 -0.15 0.25 0.95 -0.20 0.17
celltypes=factor(substr(rownames(Msig3transp),7,9))
status=factor(substr(rownames(Msig3transp),1,3))
#house=read.table("/Users/susan/Dropbox/CaseStudies/votes.txt")
#head(house[,1:5])
#party=scan("/Users/susan/Dropbox/CaseStudies/party.txt")
#table(party)
library(dplyr)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
my_mtcars <- select(mtcars, mpg, wt, qsec, cyl, gear, am) %>%
mutate(cyl = factor(cyl), gear = factor(gear), am = factor(am))
ggpairs(my_mtcars, aes(color = cyl))
library(dplyr)
library(GGally)
my_mtcars <- select(mtcars, mpg, wt, qsec)
ggpairs(my_mtcars)
my_mtcars <- select(mtcars, mpg, wt, qsec, hp)
apply(my_mtcars, 2, mean)
## mpg wt qsec hp
## 20.09062 3.21725 17.84875 146.68750
apply(my_mtcars, 2, sd)
## mpg wt qsec hp
## 6.0269481 0.9784574 1.7869432 68.5628685
my_mtcars_centered = scale(my_mtcars, center = TRUE, scale = FALSE)
apply(my_mtcars_centered, 2, mean)
## mpg wt qsec hp
## 4.440892e-16 3.469447e-17 9.436896e-16 0.000000e+00
apply(my_mtcars_centered, 2, sd)
## mpg wt qsec hp
## 6.0269481 0.9784574 1.7869432 68.5628685
my_mtcars_centered_scaled = scale(my_mtcars)
apply(my_mtcars_centered_scaled, 2, mean)
## mpg wt qsec hp
## 7.112366e-17 4.681043e-17 5.299580e-16 1.040834e-17
apply(my_mtcars_centered_scaled, 2, sd)
## mpg wt qsec hp
## 1 1 1 1
my_mtcars_centered_scaled <- as.data.frame(my_mtcars_centered_scaled)
my_mtcars_centered_scaled$cyl <- factor(mtcars$cyl)
ggplot(my_mtcars_centered_scaled,
aes(x = mpg, y = wt, col = cyl)) +
geom_point(size = 3) +
theme(text = element_text(size = 30))
ggplot(mtcars,
aes(x = mpg, y = wt, col = factor(cyl))) +
geom_point(size = 3) +
theme(text = element_text(size = 30)) +
scale_color_discrete(name = "cyl")
library(ade4)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
my_mtcars <- select(mtcars, mpg, wt, disp, hp, qsec, drat)
pca_mtcars = dudi.pca(my_mtcars, scannf = FALSE, nf = 3)
fviz_eig(pca_mtcars, geom = "bar", bar_width = 0.7) + ggtitle("")
fviz_pca_ind(pca_mtcars, repel = FALSE) +
coord_fixed() +
ylim(-3.5, 3.5) + xlim(-5, 5) +
theme(text = element_text(size = 15))
fviz_pca_ind(pca_mtcars,
col.ind = factor(mtcars$am),
repel = TRUE) + # TRUE to avoid text overlapping (slow if many points)
coord_fixed() +
ylim(-3.5, 3.5) + xlim(-5, 5) +
theme(text = element_text(size = 15))
fviz_pca_biplot(pca_mtcars, repel = TRUE, arrowsize = 1) +
coord_fixed() +
ylim(-3.5, 3.5) + xlim(-5, 5) +
theme(text = element_text(size = 15))
fviz_pca_biplot(pca_mtcars,
geom = "point",
arrowsize = 1,
pointsize = 2) +
coord_fixed() +
ylim(-3.5, 3.5) + xlim(-5, 5) +
theme(text = element_text(size = 15))
fviz_pca_biplot(pca_mtcars,
col.ind = factor(mtcars$cyl),
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
geom = "point",
arrowsize = 1,pointsize = 2,
addEllipses = TRUE,
ellipse.level = 0.6,
# ellipse.type = "confidence",
legend.title = "No. cylinders") +
coord_fixed() +
ylim(-3.5, 3.5) + xlim(-5, 5) +
theme(text = element_text(size = 15))
cor(as.matrix(my_mtcars)) %>% round(1)
## mpg wt disp hp qsec drat
## mpg 1.0 -0.9 -0.8 -0.8 0.4 0.7
## wt -0.9 1.0 0.9 0.7 -0.2 -0.7
## disp -0.8 0.9 1.0 0.8 -0.4 -0.7
## hp -0.8 0.7 0.8 1.0 -0.7 -0.4
## qsec 0.4 -0.2 -0.4 -0.7 1.0 0.1
## drat 0.7 -0.7 -0.7 -0.4 0.1 1.0
fviz_pca_var(pca_mtcars)
# Color by cos2 values: quality on the factor map
fviz_pca_var(pca_mtcars, col.var = "cos2",
gradient.cols = viridis::viridis(100)[1:95],
repel = TRUE # Avoid text overlapping
)
library(ade4)
library(factoextra)
load("mat1xcms.RData")
pcamat1 = dudi.pca(t(mat1), scannf = FALSE, nf = 3)
fviz_eig(pcamat1, geom = "bar", bar_width = 0.7) + ggtitle("")
dfmat1 = cbind(pcamat1$li, tibble(
label = rownames(pcamat1$li),
number = substr(label, 3, 4),
type = factor(substr(label, 1, 2))))
pcsplot = ggplot(dfmat1,
aes(x=Axis1, y=Axis2, label=label, group=number, colour=type)) +
geom_text(size = 4, vjust = -0.5) + geom_point(size = 3)
pcsplot + geom_hline(yintercept = 0, linetype = 2) +
geom_vline(xintercept = 0, linetype = 2) +
coord_fixed()
fviz_pca_ind(pcamat1) + ggtitle("")
fviz_pca_var(pcamat1, col.circle="black") + ggtitle("")
data("x", package = "Hiiragi2013")
xwt = x[, x$genotype == "WT"]
sel = order(genefilter::rowVars(Biobase::exprs(xwt)), decreasing = TRUE)[1:100]
xwt = xwt[sel, ]
tab = table(xwt$sampleGroup)
tab
##
## E3.25 E3.5 (EPI) E3.5 (PE) E4.5 (EPI) E4.5 (PE)
## 36 11 11 4 4
pcaMouse = dudi.pca(as.data.frame(t(Biobase::exprs(xwt))),
center = TRUE, scale = TRUE, nf = 2,
scannf = FALSE)
fviz_eig(pcaMouse)
xwt$weight = 1 / as.numeric(tab[xwt$sampleGroup])
pcaMouse_wt = dudi.pca(as.data.frame(t(Biobase::exprs(xwt))),
row.w = xwt$weight,
center = TRUE, scale = TRUE, nf = 2,
scannf = FALSE)
fviz_eig(pcaMouse_wt)
fviz_pca_ind(pcaMouse, geom = "point", col.ind = xwt$sampleGroup) +
coord_fixed()
fviz_pca_ind(pcaMouse_wt, geom = "point", col.ind = xwt$sampleGroup) +
coord_fixed()
library("pheatmap")
load("wine.RData")
load("wineClass.RData")
head(wine)
## Alcohol MalicAcid Ash AlcAsh Mg Phenols Flav NonFlavPhenols Proa Color
## 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64
## 2 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38
## 3 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68
## 4 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80
## 5 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32
## 6 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75
## Hue OD Proline
## 1 1.04 3.92 1065
## 2 1.05 3.40 1050
## 3 1.03 3.17 1185
## 4 0.86 3.45 1480
## 5 1.04 2.93 735
## 6 1.05 2.85 1450
load("wine.RData")
apply(wine, 2, mean)
## Alcohol MalicAcid Ash AlcAsh Mg
## 13.0006180 2.3363483 2.3665169 19.4949438 99.7415730
## Phenols Flav NonFlavPhenols Proa Color
## 2.2951124 2.0292697 0.3618539 1.5908989 5.0580899
## Hue OD Proline
## 0.9574494 2.6116854 746.8932584
apply(wine, 2, sd)
## Alcohol MalicAcid Ash AlcAsh Mg
## 0.8118265 1.1171461 0.2743440 3.3395638 14.2824835
## Phenols Flav NonFlavPhenols Proa Color
## 0.6258510 0.9988587 0.1244533 0.5723589 2.3182859
## Hue OD Proline
## 0.2285716 0.7099904 314.9074743
wine_scaled = scale(wine)
apply(wine_scaled, 2, mean)
## Alcohol MalicAcid Ash AlcAsh Mg
## -8.591766e-16 -6.776446e-17 8.045176e-16 -7.720494e-17 -4.073935e-17
## Phenols Flav NonFlavPhenols Proa Color
## -1.395560e-17 6.958263e-17 -1.042186e-16 -1.221369e-16 3.649376e-17
## Hue OD Proline
## 2.093741e-16 3.003459e-16 -1.034429e-16
apply(wine_scaled, 2, sd)
## Alcohol MalicAcid Ash AlcAsh Mg
## 1 1 1 1 1
## Phenols Flav NonFlavPhenols Proa Color
## 1 1 1 1 1
## Hue OD Proline
## 1 1 1
library(pheatmap)
pheatmap(1 - cor(wine))
wine_scaled <- as.data.frame(wine_scaled)
GGally::ggpairs(wine_scaled)
library(ade4)
winePCA = dudi.pca(wine, nf = 5, scale = TRUE, center = TRUE, scannf=FALSE)
class(winePCA)
## [1] "pca" "dudi"
names(winePCA)
## [1] "tab" "cw" "lw" "eig" "rank" "nf" "c1" "li" "co" "l1"
## [11] "call" "cent" "norm"
library(factoextra)
fviz_eig(winePCA)
#eig_ratio <- winePCA$eig[2]/winePCA$eig[1]
fviz_pca_ind(winePCA) +
coord_fixed()
load("wineClass.RData")
table(wine.class)
## wine.class
## barolo grignolino barbera
## 59 71 48
fviz_pca_ind(winePCA, col.ind = wine.class,
palette = c("#E41A1C", "#377EB8", "#4DAF4A")) +
coord_fixed()
fviz_pca_ind(winePCA, col.ind = wine.class,
palette = c("#E41A1C", "#377EB8", "#4DAF4A"),
addEllipses = TRUE, ellipse.level = 0.9) +
coord_fixed()
fviz_pca_ind(winePCA, axes = 2:3, col.ind = wine.class,
palette = c("#E41A1C", "#377EB8", "#4DAF4A")) +
coord_fixed()
fviz_pca_var(winePCA)
fviz_contrib(winePCA, choice = "var", axes = 1)
fviz_contrib(winePCA, choice = "var", axes = 2)
fviz_contrib(winePCA, choice = "var", axes = 1:2)
fviz_pca_biplot(
winePCA, geom = "point",
col.ind = wine.class,
col.var = "#c07d44",
addEllipses = TRUE, ellipse.level = 0.7) +
coord_fixed() +
scale_color_brewer(palette = "Set1")
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pheatmap_1.0.8 factoextra_1.0.5 GGally_1.4.0 ade4_1.7-11
## [5] forcats_0.3.0 stringr_1.3.0 dplyr_0.7.5.9000 purrr_0.2.4
## [9] readr_1.1.1 tidyr_0.8.0 tibble_1.4.2 ggplot2_3.0.0
## [13] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-137 bitops_1.0-6 phyloseq_1.23.1
## [4] bit64_0.9-7 lubridate_1.7.4 RColorBrewer_1.1-2
## [7] httr_1.3.1 rprojroot_1.3-2 tools_3.5.0
## [10] backports_1.1.2 R6_2.2.2 vegan_2.5-2
## [13] DBI_1.0.0 lazyeval_0.2.1 BiocGenerics_0.26.0
## [16] mgcv_1.8-23 colorspace_1.3-2 permute_0.9-4
## [19] withr_2.1.2 tidyselect_0.2.4 gridExtra_2.3
## [22] mnormt_1.5-5 bit_1.1-12 compiler_3.5.0
## [25] cli_1.0.0 rvest_0.3.2 Biobase_2.40.0
## [28] xml2_1.2.0 labeling_0.3 scales_0.5.0
## [31] psych_1.8.3.3 genefilter_1.62.0 digest_0.6.15
## [34] foreign_0.8-70 rmarkdown_1.9 XVector_0.20.0
## [37] pkgconfig_2.0.1 htmltools_0.3.6 rlang_0.2.0.9001
## [40] readxl_1.1.0 RSQLite_2.1.1 rstudioapi_0.7
## [43] bindr_0.1.1 jsonlite_1.5 RCurl_1.95-4.10
## [46] magrittr_1.5 biomformat_1.8.0 Matrix_1.2-14
## [49] Rcpp_0.12.17 munsell_0.4.3 S4Vectors_0.18.3
## [52] Rhdf5lib_1.2.1 ape_5.1 viridis_0.5.1
## [55] stringi_1.1.7 yaml_2.1.18 MASS_7.3-49
## [58] zlibbioc_1.26.0 rhdf5_2.24.0 plyr_1.8.4
## [61] blob_1.1.1 grid_3.5.0 parallel_3.5.0
## [64] ggrepel_0.7.0 crayon_1.3.4 lattice_0.20-35
## [67] Biostrings_2.48.0 haven_1.1.1 splines_3.5.0
## [70] annotate_1.58.0 multtest_2.36.0 hms_0.4.2
## [73] knitr_1.20 pillar_1.2.2 igraph_1.2.1
## [76] ggpubr_0.1.6 reshape2_1.4.3 codetools_0.2-15
## [79] stats4_3.5.0 XML_3.98-1.11 glue_1.2.0
## [82] evaluate_0.10.1 data.table_1.11.4 modelr_0.1.1
## [85] foreach_1.4.4 cellranger_1.1.0 gtable_0.2.0
## [88] reshape_0.8.7 assertthat_0.2.0 xtable_1.8-2
## [91] broom_0.4.4 viridisLite_0.3.0 survival_2.42-3
## [94] iterators_1.0.9 memoise_1.1.0 AnnotationDbi_1.42.1
## [97] IRanges_2.14.10 bindrcpp_0.2.2 cluster_2.0.7-1